In [1]:
# %load database-reader.py
import sqlite3
import pandas as pd
from xml.etree import ElementTree
Origin: 3,040 raw match observations for the English Premier League. After merging with the team attributes, this reduces to 2,393 observations.
Missing data: (1) from the Match table we discard the PS, SJ, GB, and BS betting-company odds and the player features, and keep only the Premier League; (2) we then discard an additional four rows with empty features.
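These counts can be sanity-checked directly against the database. A minimal sketch, assuming the same soccer/database.sqlite file and the EPL league_id of 1729 used in the cells below:
import sqlite3
import pandas as pd
conn_check = sqlite3.connect("soccer/database.sqlite")
n_raw = pd.read_sql("SELECT COUNT(*) AS n FROM Match WHERE league_id = 1729", con=conn_check)['n'][0]
print('raw EPL matches:', n_raw)   # should agree with the 3,040 quoted above
conn_check.close()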
In [2]:
db_location = "soccer/database.sqlite"
conn = sqlite3.connect(db_location)
In [3]:
df_match = pd.read_sql(sql="SELECT id,league_id, season, stage, date, match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, goal, shoton, shotoff, foulcommit, card, cross, corner, possession, B365H, B365D, B365A, BWH, BWD, BWA, IWH, IWD, IWA, LBH, LBD, LBA, WHH,WHD,WHA,VCH,VCD,VCA FROM Match WHERE league_id=1729 AND date >= '2010-02-22'", con=conn)
df_match=df_match.dropna()
df_match.head()
Out[3]:
In [4]:
frames = list()
for index, row in df_match.iterrows():
    hteam = row.home_team_api_id
    ateam = row.away_team_api_id
    date = row.date  ## match date
    hquery = "SELECT * FROM team_attributes WHERE team_api_id=" + str(hteam)
    aquery = "SELECT * FROM team_attributes WHERE team_api_id=" + str(ateam)
    df_hteam = pd.read_sql(sql=hquery, con=conn)
    df_ateam = pd.read_sql(sql=aquery, con=conn)
    ## keep only attribute snapshots taken on or before the match date,
    ## then take the last remaining row as the latest pre-match snapshot
    df_hteam = df_hteam[df_hteam.date <= date]
    df_ateam = df_ateam[df_ateam.date <= date]
    df_hlatest = df_hteam.tail(1)
    df_alatest = df_ateam.tail(1)
    df_row = row.to_frame().T
    ##print(df_row)
    df_row_home = pd.merge(df_row, df_hlatest, how='outer', left_on='home_team_api_id',
                           right_on='team_api_id', suffixes=('_match', '_home'))
    df_row_home_away = pd.merge(df_row_home, df_alatest, how='outer', left_on='away_team_api_id',
                                right_on='team_api_id', suffixes=('_home', '_away'))
    frames.append(df_row_home_away)
    ##hteam_frames.append(df_hlatest)
    ##ateam_frames.append(df_alatest)
df_match_team = pd.concat(frames).reset_index(drop=True)
#df_ateams = pd.concat(ateam_frames)
df_match_team.head()
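The row-by-row loop above can also be expressed with pandas.merge_asof, which attaches, for each match, the most recent attribute row dated on or before the match. A hedged alternative sketch, not the approach used in this notebook, assuming df_match and the connection from the cells above:
df_attr = pd.read_sql("SELECT * FROM team_attributes", con=conn)
df_attr['date'] = pd.to_datetime(df_attr['date'])
df_m = df_match.copy()
df_m['date'] = pd.to_datetime(df_m['date'])
# both frames must be sorted on the asof key
df_m = df_m.sort_values('date')
df_attr = df_attr.sort_values('date')
# latest home-team attributes recorded on or before each match date
with_home = pd.merge_asof(df_m, df_attr, on='date',
                          left_by='home_team_api_id', right_by='team_api_id',
                          direction='backward', suffixes=('_match', '_home'))
# repeating the call with left_by='away_team_api_id' attaches the away-team attributes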
Out[4]:
In [5]:
# Replace possession XML with the latest value
for index, row in df_match_team.iterrows():
    tree = ElementTree.fromstring(row['possession'])
    elapsed = -1
    possession = -1
    for value in tree:
        # guard against entries that lack an elapsed/homepos tag
        curelapsed = -1
        curpossession = possession
        for tag in value:
            if tag.tag == 'elapsed':
                curelapsed = int(tag.text)
            if tag.tag == 'homepos':
                curpossession = tag.text
        # keep the home-possession value with the largest elapsed time
        if elapsed <= curelapsed:
            elapsed = curelapsed
            possession = curpossession
    df_match_team.loc[index, 'possession'] = possession
df_match_team = df_match_team.rename(columns={'possession': 'possession_home'})
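For reference, each possession entry carries (at least) an elapsed and a homepos tag, and the loop above keeps the homepos value with the largest elapsed time. A tiny synthetic example; the exact XML layout is inferred from the tags read above, not from the database documentation:
sample = '<possession><value><elapsed>45</elapsed><homepos>55</homepos></value><value><elapsed>90</elapsed><homepos>60</homepos></value></possession>'
root = ElementTree.fromstring(sample)
latest = max(root, key=lambda v: int(v.find('elapsed').text))
print(latest.find('homepos').text)   # '60' -- the full-time home possession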
In [6]:
def parse_xml(field):
    global df_match_team
    # Replace the XML in `field` with per-team event counts
    if field == 'card':
        df_match_team['ycard_home'] = 0
        df_match_team['ycard_away'] = 0
        df_match_team['rcard_home'] = 0
        df_match_team['rcard_away'] = 0
    else:
        df_match_team['%s_home' % field] = 0
        df_match_team['%s_away' % field] = 0
        if field == 'cross':
            df_match_team['throwin_home'] = 0
            df_match_team['throwin_away'] = 0
    for index, row in df_match_team.iterrows():
        tree = ElementTree.fromstring(row[field])
        for value in tree:
            ignore = False
            away = ''
            cardtype = ''
            field_save = field
            for tag in value:
                if tag.tag == 'team':
                    current_team = int(tag.text)
                    if current_team == row['home_team_api_id']:
                        away = 'home'
                    elif current_team == row['away_team_api_id']:
                        away = 'away'
                    else:
                        print(current_team)
                        print(row)
                        assert False
                if (tag.tag == 'card_type' or tag.tag == 'comment') and field == 'card':
                    cardtype = tag.text
                    if cardtype == 'y2':
                        # a second yellow is counted as a red card
                        cardtype = 'r'
                if tag.tag == 'type':
                    if tag.text != field:
                        if field == 'cross' and tag.text == 'throwin':
                            # throw-ins are logged under 'cross' but counted separately
                            field_save = tag.text
                        else:
                            if tag.text != 'corner':
                                print('%s instead of %s' % (tag.text, field))
                            ignore = True
            if not ignore and (away == 'away' or away == 'home'):
                df_match_team.loc[index, '%s%s_%s' % (cardtype, field_save, away)] += 1
    df_match_team = df_match_team.drop(field, axis=1)
parse_xml('corner')
parse_xml('cross')
parse_xml('shoton')
parse_xml('shotoff')
parse_xml('foulcommit')
parse_xml('card')
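For fields other than card, the parser simply counts events per team (with throw-ins split out of cross). A standalone illustration of that counting for one synthetic corner fragment, using made-up team ids and an XML shape inferred from the tags read above:
sample_corner = ('<corner>'
                 '<value><type>corner</type><team>1111</team></value>'
                 '<value><type>corner</type><team>2222</team></value>'
                 '<value><type>corner</type><team>1111</team></value>'
                 '</corner>')
home_id, away_id = 1111, 2222   # hypothetical api ids
counts = {'corner_home': 0, 'corner_away': 0}
for value in ElementTree.fromstring(sample_corner):
    team = int(value.find('team').text)
    counts['corner_home' if team == home_id else 'corner_away'] += 1
print(counts)   # {'corner_home': 2, 'corner_away': 1}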
In [7]:
# Drop IDs and dates
df_match_team = df_match_team.drop(labels=['date', 'id', 'id_match', 'league_id', 'season', 'date_match', 'match_api_id',
                                           'home_team_api_id', 'away_team_api_id', 'team_fifa_api_id_home',
                                           'team_api_id_home', 'team_fifa_api_id_away', 'team_api_id_away',
                                           'id_home', 'date_home', 'goal'], axis=1)
# Use the categorical "Class" columns to fill missing numerical values
replacements = []
for index, row in df_match_team.iterrows():
    for column in df_match_team:
        if pd.isnull(row[column]):
            # a numeric column X_home/X_away has a categorical XClass_home/XClass_away counterpart
            classColumn = column[:-5] + 'Class' + column[-5:]
            if classColumn in df_match_team:
                replacement = df_match_team[column][df_match_team[classColumn] == row[classColumn]].median()
                replacements.append('In %s: replaced %s with %f' % (column, row[classColumn], replacement))
                df_match_team.loc[index, column] = replacement
for r in set(replacements):
    print(r)
# Drop columns that still have missing values
df_match_team = df_match_team.dropna(axis=1)
# Drop Class columns that have numerical equivalents
for column in df_match_team:
    classColumn = column[:-5] + 'Class' + column[-5:]
    if classColumn in df_match_team:
        df_match_team = df_match_team.drop(classColumn, axis=1)
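A toy illustration of the Class-based imputation above; the column names here are made up, but they follow the same numeric/Class pairing the slicing column[:-5] + 'Class' + column[-5:] relies on:
toy = pd.DataFrame({'speed_home': [30.0, 35.0, None, 70.0, 75.0],
                    'speedClass_home': ['Slow', 'Slow', 'Slow', 'Fast', 'Fast']})
for index, row in toy.iterrows():
    if pd.isnull(row['speed_home']):
        same_class = toy['speed_home'][toy['speedClass_home'] == row['speedClass_home']]
        toy.loc[index, 'speed_home'] = same_class.median()
print(toy)   # the missing 'Slow' value is filled with 32.5, the median of 30 and 35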
In [8]:
for columnName in df_match_team:
    no_unique = len(df_match_team[columnName].unique())
    if no_unique == 1:
        print('Dropping %s because it has only 1 unique value' % columnName)
        df_match_team = df_match_team.drop(columnName, axis=1)
    if no_unique == 2:
        # binarize: True where the value equals the column's mode
        mode = df_match_team[columnName].mode().values[0]
        df_match_team.loc[:, columnName] = df_match_team[columnName] == mode
        df_match_team = df_match_team.rename(columns={columnName: columnName + '_is' + mode})
        print('Binarizing %s to %s_is%s' % (columnName, columnName, mode))
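A toy example of the binarization step, with a made-up column and values:
toy = pd.DataFrame({'styleClass_home': ['Fast', 'Slow', 'Fast', 'Fast']})
mode = toy['styleClass_home'].mode().values[0]                  # 'Fast'
toy.loc[:, 'styleClass_home'] = toy['styleClass_home'] == mode
toy = toy.rename(columns={'styleClass_home': 'styleClass_home_is' + mode})
print(toy)   # one boolean column, styleClass_home_isFast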
In [9]:
df_match_team.to_csv('dataset.csv', index=None)
df_match_team.dtypes
Out[9]:
In [10]:
conn.close()